{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "id": "5f93b7d1",
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/home/zhy/anaconda3/envs/mathglm/lib/python3.9/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n",
      "  from .autonotebook import tqdm as notebook_tqdm\n",
      "Building prefix dict from the default dictionary ...\n",
      "Loading model from cache /tmp/jieba.cache\n",
      "Loading model cost 0.383 seconds.\n",
      "Prefix dict has been built successfully.\n"
     ]
    }
   ],
   "source": [
    "import os\n",
    "import mindspore\n",
    "from mindnlp.transformers import AutoModelForSeq2SeqLM\n",
    "from mindnlp.peft import get_peft_config, get_peft_model, get_peft_model_state_dict, LoraConfig, TaskType, PeftModel\n",
    "from mindnlp.dataset import load_dataset\n",
    "from mindnlp.core import ops\n",
    "\n",
    "from mindnlp.transformers import AutoTokenizer\n",
    "from mindnlp.common.optimization import get_linear_schedule_with_warmup\n",
    "from tqdm import tqdm\n",
    "\n",
    "model_name_or_path = \"bigscience/mt0-small\"\n",
    "tokenizer_name_or_path = \"bigscience/mt0-small\"\n",
    "\n",
    "checkpoint_name = \"financial_sentiment_analysis_lora_v1.ckpt\"\n",
    "max_length = 128\n",
    "lr = 1e-3\n",
    "num_epochs = 3\n",
    "batch_size = 8"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "id": "8d0850ac",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "trainable params: 344,064 || all params: 300,520,832 || trainable%: 0.11448923447676333\n"
     ]
    }
   ],
   "source": [
    "# creating model\n",
    "peft_config = LoraConfig(task_type=TaskType.SEQ_2_SEQ_LM, inference_mode=False, use_dora=True)\n",
    "\n",
    "model = AutoModelForSeq2SeqLM.from_pretrained(model_name_or_path)\n",
    "\n",
    "model = PeftModel(model, peft_config)\n",
    "model.print_trainable_parameters()"
   ]
  },
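  {
   "cell_type": "markdown",
   "id": "1f6c2a8b",
   "metadata": {},
   "source": [
    "As a quick check, print the adapter configuration that was just applied: it shows the\n",
    "resolved defaults (rank, scaling, dropout, target modules) alongside the `use_dora=True`\n",
    "flag set above."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "2b7e9c4d",
   "metadata": {},
   "outputs": [],
   "source": [
    "# inspect the resolved LoRA/DoRA configuration\n",
    "print(peft_config)"
   ]
  },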
  {
   "cell_type": "code",
   "execution_count": 86,
   "id": "4ee2babf",
   "metadata": {},
   "outputs": [],
   "source": [
    "# mindspore.dataset.config.set_seed(123)\n",
    "# loading dataset\n",
    "dataset = load_dataset(\"financial_phrasebank\", \"sentences_allagree\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 87,
   "id": "d1ecd7a0-2e48-405e-940b-001a96ccc18f",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "['negative', 'neutral', 'positive']"
      ]
     },
     "execution_count": 87,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "classes = dataset.source.ds.features[\"label\"].names\n",
    "classes"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 88,
   "id": "8cbbf475-89b6-4368-a7eb-730dc532558e",
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "[WARNING] ME(3815480:131168514225088,MainProcess):2024-09-11-22:54:10.179.836 [mindspore/dataset/engine/datasets.py:1203] Dataset is shuffled before split.\n"
     ]
    }
   ],
   "source": [
    "train_dataset, validation_dataset = dataset.shuffle(64).split([0.9, 0.1])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 90,
   "id": "a89748b4-3193-4419-be39-2dd7907e349e",
   "metadata": {},
   "outputs": [],
   "source": [
    "def add_text_label(sentence, label):\n",
    "    return sentence, label, classes[label.item()]\n",
    "\n",
    "train_dataset = train_dataset.map(add_text_label, ['sentence', 'label'], ['sentence', 'label', 'text_label'])\n",
    "validation_dataset = validation_dataset.map(add_text_label, ['sentence', 'label'], ['sentence', 'label', 'text_label'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 46,
   "id": "396f46c4-45ba-4d45-a98f-fff173038b5b",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "{'sentence': Tensor(shape=[], dtype=String, value= 'Nokia Messaging 1.1 enables customers to receive e-mails from up to 10 e-mail accounts on their mobile phone supporting all POP or IMAP e-mail services .'),\n",
       " 'label': Tensor(shape=[], dtype=Int64, value= 1),\n",
       " 'text_label': Tensor(shape=[], dtype=String, value= 'neutral')}"
      ]
     },
     "execution_count": 46,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "next(train_dataset.create_dict_iterator())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 47,
   "id": "603ea9ef-88dd-4c8e-bd10-df7a64940b01",
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/home/zhy/anaconda3/envs/mathglm/lib/python3.9/site-packages/mindnlp/transformers/tokenization_utils_base.py:1526: FutureWarning: `clean_up_tokenization_spaces` was not set. It will be set to `True` by default. This behavior will be depracted, and will be then set to `False` by default. \n",
      "  warnings.warn(\n"
     ]
    }
   ],
   "source": [
    "tokenizer = AutoTokenizer.from_pretrained(model_name_or_path)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 48,
   "id": "86d29e1a-eb31-48bf-a6e3-bc3cfa26cde5",
   "metadata": {},
   "outputs": [],
   "source": [
    "import numpy as np\n",
    "from mindnlp.dataset import BaseMapFunction\n",
    "from threading import Lock\n",
    "lock = Lock()\n",
    "\n",
    "class MapFunc(BaseMapFunction):\n",
    "    def __call__(self, sentence, label, text_label):\n",
    "        lock.acquire()\n",
    "        model_inputs = tokenizer(sentence, max_length=max_length, padding=\"max_length\", truncation=True)\n",
    "        labels = tokenizer(text_label, max_length=3, padding=\"max_length\", truncation=True)\n",
    "        lock.release()\n",
    "        labels = labels['input_ids']\n",
    "        labels = np.where(np.equal(labels, tokenizer.pad_token_id), -100, labels)\n",
    "        return model_inputs['input_ids'], model_inputs['attention_mask'], labels\n",
    "\n",
    "\n",
    "def get_dataset(dataset, tokenizer, shuffle=True):\n",
    "    input_colums=['sentence', 'label', 'text_label']\n",
    "    output_columns=['input_ids', 'attention_mask', 'labels']\n",
    "    dataset = dataset.map(MapFunc(input_colums, output_columns),\n",
    "                          input_colums, output_columns)\n",
    "    if shuffle:\n",
    "        dataset = dataset.shuffle(64)\n",
    "    dataset = dataset.batch(batch_size)\n",
    "    return dataset\n",
    "\n",
    "train_dataset = get_dataset(train_dataset, tokenizer)\n",
    "eval_dataset = get_dataset(validation_dataset, tokenizer, shuffle=True)"
   ]
  },
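  {
   "cell_type": "markdown",
   "id": "3c8f0d5e",
   "metadata": {},
   "source": [
    "A sanity check on the tokenized pipeline: `input_ids` and `attention_mask` should come\n",
    "out with shape `(batch_size, max_length)` and `labels` with shape `(batch_size, 3)`,\n",
    "where padded label positions hold `-100`."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "4d9a1e6f",
   "metadata": {},
   "outputs": [],
   "source": [
    "# peek at one tokenized, batched example\n",
    "batch = next(train_dataset.create_dict_iterator())\n",
    "{name: (tensor.shape, tensor.dtype) for name, tensor in batch.items()}"
   ]
  },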
  {
   "cell_type": "code",
   "execution_count": 49,
   "id": "f733a3c6",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "AdamW (\n",
      "Parameter Group 0\n",
      "    amsgrad: False\n",
      "    betas: (0.9, 0.999)\n",
      "    eps: 1e-08\n",
      "    lr: 0.001\n",
      "    maximize: False\n",
      "    weight_decay: 0.01\n",
      ")\n"
     ]
    }
   ],
   "source": [
    "from mindnlp.core import optim\n",
    "# optimizer and lr scheduler\n",
    "optimizer = optim.AdamW(model.trainable_params(), lr=lr)\n",
    "print(optimizer)\n",
    "lr_scheduler = get_linear_schedule_with_warmup(\n",
    "    optimizer=optimizer,\n",
    "    num_warmup_steps=0,\n",
    "    num_training_steps=(len(train_dataset) * num_epochs),\n",
    ")"
   ]
  },
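  {
   "cell_type": "markdown",
   "id": "5e0b2f70",
   "metadata": {},
   "source": [
    "With `batch_size=8` the training split yields 255 batches per epoch (visible in the\n",
    "progress bars below), so the linear schedule decays the learning rate from `1e-3` to\n",
    "zero over 255 × 3 = 765 optimizer steps, with no warmup."
   ]
  },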
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "6b3a4090",
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "100%|██████████| 255/255 [1:31:39<00:00, 21.57s/it]\n",
      "100%|██████████| 29/29 [00:03<00:00,  7.93it/s]\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "epoch=0: train_ppl=Tensor(shape=[], dtype=Float32, value= 1.25075) train_epoch_loss=Tensor(shape=[], dtype=Float32, value= 0.223744) eval_ppl=Tensor(shape=[], dtype=Float32, value= 1.08663) eval_epoch_loss=Tensor(shape=[], dtype=Float32, value= 0.0830831)\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "100%|██████████| 255/255 [3:21:58<00:00, 47.52s/it]  \n",
      "100%|██████████| 29/29 [00:03<00:00,  9.11it/s]\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "epoch=1: train_ppl=Tensor(shape=[], dtype=Float32, value= 1.18346) train_epoch_loss=Tensor(shape=[], dtype=Float32, value= 0.168442) eval_ppl=Tensor(shape=[], dtype=Float32, value= 1.07791) eval_epoch_loss=Tensor(shape=[], dtype=Float32, value= 0.0750237)\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "100%|██████████| 255/255 [2:24:43<00:00, 34.05s/it]  \n",
      "100%|██████████| 29/29 [00:03<00:00,  9.06it/s]"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "epoch=2: train_ppl=Tensor(shape=[], dtype=Float32, value= 1.13888) train_epoch_loss=Tensor(shape=[], dtype=Float32, value= 0.130047) eval_ppl=Tensor(shape=[], dtype=Float32, value= 1.05385) eval_epoch_loss=Tensor(shape=[], dtype=Float32, value= 0.0524506)\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "\n"
     ]
    }
   ],
   "source": [
    "from mindnlp.core import value_and_grad\n",
    "# training and evaluation\n",
    "def forward_fn(**batch):\n",
    "    outputs = model(**batch)\n",
    "    loss = outputs.loss\n",
    "    return loss\n",
    "\n",
    "grad_fn = value_and_grad(forward_fn, model.trainable_params(), attach_grads=True)\n",
    "\n",
    "for epoch in range(num_epochs):\n",
    "    model.set_train()\n",
    "    total_loss = 0\n",
    "    train_total_size = train_dataset.get_dataset_size()\n",
    "    for step, batch in enumerate(tqdm(train_dataset.create_dict_iterator(), total=train_total_size)):\n",
    "        optimizer.zero_grad()\n",
    "        loss = grad_fn(**batch)\n",
    "        optimizer.step()\n",
    "        total_loss += loss.float()\n",
    "        lr_scheduler.step()\n",
    "\n",
    "    model.set_train(False)\n",
    "    eval_loss = 0\n",
    "    eval_preds = []\n",
    "    eval_total_size = eval_dataset.get_dataset_size()\n",
    "    for step, batch in enumerate(tqdm(eval_dataset.create_dict_iterator(), total=eval_total_size)):\n",
    "        with mindspore._no_grad():\n",
    "            outputs = model(**batch)\n",
    "        loss = outputs.loss\n",
    "        eval_loss += loss.float()\n",
    "        eval_preds.extend(\n",
    "            tokenizer.batch_decode(ops.argmax(outputs.logits, -1).asnumpy(), skip_special_tokens=True)\n",
    "        )\n",
    "\n",
    "    eval_epoch_loss = eval_loss / len(eval_dataset)\n",
    "    eval_ppl = ops.exp(eval_epoch_loss)\n",
    "    train_epoch_loss = total_loss / len(train_dataset)\n",
    "    train_ppl = ops.exp(train_epoch_loss)\n",
    "    print(f\"{epoch=}: {train_ppl=} {train_epoch_loss=} {eval_ppl=} {eval_epoch_loss=}\")"
   ]
  },
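  {
   "cell_type": "markdown",
   "id": "6f1c3a81",
   "metadata": {},
   "source": [
    "Persist only the trained adapter weights rather than a full copy of the base model.\n",
    "This is a minimal sketch that assumes `mindnlp.peft` mirrors the Hugging Face PEFT\n",
    "`save_pretrained` API; the `peft_model_id` directory name is an illustrative choice."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "7a2d4b92",
   "metadata": {},
   "outputs": [],
   "source": [
    "# save only the adapter weights; the frozen base model is not duplicated\n",
    "# NOTE: assumes PeftModel exposes the HF PEFT-style save_pretrained()\n",
    "peft_model_id = f\"{model_name_or_path}_{peft_config.peft_type}_{peft_config.task_type}\"\n",
    "model.save_pretrained(peft_model_id)"
   ]
  },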
  {
   "cell_type": "code",
   "execution_count": 114,
   "id": "6cafa67b",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "accuracy=47.34513274336283 % on the evaluation dataset\n",
      "eval_preds[:10]=['positive', 'positivenegative', 'neutral', 'neutral', 'neutral', 'neutral', 'positive', 'neutral', 'positive', 'neutral']\n",
      "ground_truth[:10]=['neutral', 'negative', 'neutral', 'neutral', 'neutral', 'positive', 'positive', 'positive', 'neutral', 'positive']\n"
     ]
    }
   ],
   "source": [
    "# print accuracy\n",
    "correct = 0\n",
    "total = 0\n",
    "\n",
    "ground_truth = []\n",
    "\n",
    "for pred, data in zip(eval_preds, validation_dataset.create_dict_iterator(output_numpy=True)):\n",
    "    true = str(data['text_label'])\n",
    "    ground_truth.append(true)\n",
    "    if pred.strip() == true.strip():\n",
    "        correct += 1\n",
    "    total += 1\n",
    "accuracy = correct / total * 100\n",
    "print(f\"{accuracy=} % on the evaluation dataset\")\n",
    "print(f\"{eval_preds[:10]=}\")\n",
    "print(f\"{ground_truth[:10]=}\")"
   ]
  }
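  ,
  {
   "cell_type": "markdown",
   "id": "8b3e5ca3",
   "metadata": {},
   "source": [
    "Finally, a quick qualitative check: run the fine-tuned model on one validation sentence\n",
    "and decode the generated sentiment label. A minimal sketch, assuming `generate()` and\n",
    "`return_tensors=\"ms\"` behave as in the Hugging Face API that MindNLP mirrors."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "9c4f6db4",
   "metadata": {},
   "outputs": [],
   "source": [
    "# generate a sentiment label for a single validation sentence\n",
    "model.set_train(False)\n",
    "sample = next(validation_dataset.create_dict_iterator(output_numpy=True))\n",
    "print(str(sample['sentence']))\n",
    "inputs = tokenizer(str(sample['sentence']), return_tensors=\"ms\")  # \"ms\" -> MindSpore tensors (assumed)\n",
    "with mindspore._no_grad():\n",
    "    generated = model.generate(input_ids=inputs[\"input_ids\"], attention_mask=inputs[\"attention_mask\"], max_new_tokens=10)\n",
    "print(tokenizer.batch_decode(generated.asnumpy(), skip_special_tokens=True))"
   ]
  }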
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "mathglm",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.9.12"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
